see code
library(gt)
library(gtExtras)
# Overview table for the 2022 data-cleaning script: one row per script section,
# holding the section name, a verbatim code sample, and the reason for the step.
# The code samples are single-quoted, multi-line strings. Their text, including
# the embedded line breaks, is table content, so they are kept flush-left and
# must not be re-indented.
data.frame(
  section = c(
    "Importing Libraries",
    "Importing Data",
    "Open Text Cleaning",
    "List to Vec",
    "Completeness Check"
  ),
  script_sample = c(
    # Sample 1: importing libraries
'lapply(as.list(c("tidyverse","janitor","arrow",
"ggthemes","tidytext","tidymodels",
"textrecipes","glmnet")),
require,character.only=TRUE) |>
suppressWarnings() |>
suppressMessages() |>
invisible()',
    # Sample 2: importing data
'read_csv(file = "./Data/Ask A Manager Salary Survey 2022_Responses.csv") |>
clean_names() |>
mutate(timestamp = mdy_hms(timestamp)) |>
filter(!is.na(job_title)) |>
mutate(additional_compensation = ifelse(is.na(additional_compensation),
0,additional_compensation),
full_compensation = salary+additional_compensation) |>
filter(!currency %in% c("Other","HKD")) |>
rownames_to_column("respondent")',
    # Sample 3: open text cleaning
'text_cleaner <- function(a_vec){
tokens <- str_split(a_vec,pattern="\\s{1,}") |>
unlist()
tokens <- tokens[!grepl("\\s{1,}|[[:punct:]]|\\d{1,}",tokens)] |> str_to_lower()
tokens <- unique(tokens)
tokens <- tokens[!tokens %in% c(stopwords::data_stopwords_nltk[["en"]])]
return(tokens)}',
    # Sample 4: list to vec (string ends with a newline)
'list_to_vec <- function(a_vec){
new_vec <- a_vec |>
as.character()
new_vec <- gsub("(^c+[[:punct:]])|[[:punct:]]",
replacement = "",
new_vec)
return(new_vec)}
',
    # Sample 5: completeness check (string ends with a newline)
'completeness <- function(a_vec){
incomplete <- round(100/length(a_vec)*a_vec[is.na(a_vec)] |> length(),2)
return(incomplete)}
'
  ),
  reasoning = c(
    "importing libraries required for preprocessing",
    "import data, convert date column to date type, remove unviable cases and create total compensation columns",
    "split open-text fields into tokens, remove punctuation, spaces and stopwords. The function returns a list of words per case",
    "Convert the list of words per case to a vector.",
    "Check the level of missing values in each column."
  )
) |>
  # Render the data frame as a gt table and apply the NYT-style theme.
  gt() |>
  gt_theme_nytimes() |>
  # Centered title and subtitle above the table.
  tab_header(
    title = "2022 Data Clean Script Overview",
    subtitle = "An overview of sections in the data cleaning script"
  ) |>
  opt_align_table_header(align = "center") |>
  # Style only the body cells of the code-sample column: purple fill with white
  # "Fira Code" text; "pre-line" whitespace handling is requested for the
  # multi-line snippets.
  tab_style(
    locations = cells_body(columns = script_sample),
    style = list(
      cell_fill(color = "#490E6F", alpha = 0.8),
      cell_text(
        font = google_font("Fira Code"),
        color = "white",
        align = "center",
        style = "normal",
        weight = "lighter",
        whitespace = "pre-line"
      )
    )
  )
**2022 Data Clean Script Overview**

*An overview of sections in the data cleaning script*

section | script_sample | reasoning |
---|---|---|
Importing Libraries | lapply(as.list(c("tidyverse","janitor","arrow", "ggthemes","tidytext","tidymodels", "textrecipes","glmnet")), require,character.only=TRUE) |> suppressWarnings() |> suppressMessages() |> invisible() | importing libraries required for preprocessing |
Importing Data | read_csv(file = "./Data/Ask A Manager Salary Survey 2022_Responses.csv") |> clean_names() |> mutate(timestamp = mdy_hms(timestamp)) |> filter(!is.na(job_title)) |> mutate(additional_compensation = ifelse(is.na(additional_compensation), 0,additional_compensation), full_compensation = salary+additional_compensation) |> filter(!currency %in% c("Other","HKD")) |> rownames_to_column("respondent") | import data, convert date column to date type, remove unviable cases and create total compensation columns |
Open Text Cleaning | text_cleaner <- function(a_vec){ tokens <- str_split(a_vec,pattern="\s{1,}") |> unlist() tokens <- tokens[!grepl("\s{1,}|[[:punct:]]|\d{1,}",tokens)] |> str_to_lower() tokens <- unique(tokens) tokens <- tokens[!tokens %in% c(stopwords::data_stopwords_nltk[["en"]])] return(tokens)} | split open-text fields into tokens, remove punctuation, spaces and stopwords. The function returns a list of words per case |
List to Vec | list_to_vec <- function(a_vec){ new_vec <- a_vec |> as.character() new_vec <- gsub("(^c+[[:punct:]])|[[:punct:]]", replacement = "", new_vec) return(new_vec)} | Convert the list of words per case to a vector. |
Completeness Check | completeness <- function(a_vec){ incomplete <- round(100/length(a_vec)*a_vec[is.na(a_vec)] |> length(),2) return(incomplete)} | Check the level of missing values in each column. |
A table containing 2022 Data Cleaner sections